Tabla de contenidos¶

  • Constructor de sentence-embeddings
    • TF-IDF
  • ES
    • KMeans
      • pseudo-matriz de confusion
      • Matriz de distancias
      • Grafico de clusters: UMAP
      • Grafico de clusters: TSNE
      • Grafico de clusters: PCA
    • Agglomerative Hierarchical
      • pseudo-matriz de confusion
      • Matriz de distancias
      • Grafico de clusters: UMAP
      • Grafico de clusters: TSNE
      • Grafico de clusters: PCA
    • DBSCAN
      • pseudo-matriz de confusion
      • Matriz de distancias
      • Grafico de clusters: UMAP
      • Grafico de clusters: TSNE
      • Grafico de clusters: PCA
    • Gaussian Mixture
      • pseudo-matriz de confusion
      • Matriz de distancias
      • Grafico de clusters: UMAP
      • Grafico de clusters: TSNE
      • Grafico de clusters: PCA
In [1]:
!pip install umap-learn
Requirement already satisfied: umap-learn in c:\users\felip\anaconda3\lib\site-packages (0.5.2)
Requirement already satisfied: scipy>=1.0 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (1.7.3)
Requirement already satisfied: scikit-learn>=0.22 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (1.0.2)
Requirement already satisfied: numba>=0.49 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (0.51.2)
Requirement already satisfied: numpy>=1.17 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (1.21.5)
Requirement already satisfied: tqdm in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (4.62.3)
Requirement already satisfied: pynndescent>=0.5 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (0.5.6)
Requirement already satisfied: llvmlite<0.35,>=0.34.0.dev0 in c:\users\felip\anaconda3\lib\site-packages (from numba>=0.49->umap-learn) (0.34.0)
Requirement already satisfied: setuptools in c:\users\felip\anaconda3\lib\site-packages (from numba>=0.49->umap-learn) (58.0.4)
Requirement already satisfied: joblib>=0.11 in c:\users\felip\anaconda3\lib\site-packages (from pynndescent>=0.5->umap-learn) (1.1.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\felip\anaconda3\lib\site-packages (from scikit-learn>=0.22->umap-learn) (2.2.0)
Requirement already satisfied: colorama in c:\users\felip\anaconda3\lib\site-packages (from tqdm->umap-learn) (0.4.4)
WARNING: Error parsing requirements for torch: [Errno 2] No such file or directory: 'c:\\users\\felip\\anaconda3\\lib\\site-packages\\torch-1.11.0.dist-info\\METADATA'
In [2]:
import re  
import pandas as pd 
from collections import defaultdict 
import string 
import multiprocessing
import os
import gensim
import sklearn
from sklearn import linear_model
from collections import Counter
import numpy as np
import scipy
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, cohen_kappa_score, classification_report
from nltk.tokenize import word_tokenize
import pickle
import umap

# word2vec
from gensim.models import Word2Vec, KeyedVectors, FastText
from gensim.models.phrases import Phrases, Phraser
from sklearn.model_selection import train_test_split
import logging

import nltk
from nltk.stem import PorterStemmer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Configure the root logger: INFO level, "<timestamp> : <level> : <message>" records.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
# Logger named after the current module/notebook.
logger = logging.getLogger(name=__name__)
In [3]:
# Load the pickled US and ES training data.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced locally by this project.
path = "../../Data/train/df_us_train.pickle"
with open(path, "rb") as fh:  # the with-block closes the handle even if unpickling fails
    df_us_train = pickle.load(fh)

path = "../../Data/train/df_es_train.pickle"
with open(path, "rb") as fh:
    df_es_train = pickle.load(fh)
In [4]:
# Number of distinct values in each dataset's "label" column.
# n_labels_es is used below as the cluster / mixture-component count.
n_labels_us = len(df_us_train["label"].unique())
n_labels_es = len(df_es_train["label"].unique())
In [5]:
from collections import Counter

# ASCII punctuation extended with typographic quotes, ellipsis and em dash.
punctuation = string.punctuation + "«»“”‘’…—"

# Stop-word lists are downloaded as plain text, one word per line.
#   header=None           -> the lists have no header row (TODO confirm upstream);
#                            with the default header inference the first stop
#                            word is consumed as the column name and lost.
#   dtype=str,
#   keep_default_na=False -> keep words such as "null" or "nan" as literal
#                            strings instead of letting pandas turn them into NaN.
# The words are stored in a Counter; in this notebook only membership tests
# (`token in stopwords`) are performed on it.
stopwords_spanish = pd.read_csv(
    'https://raw.githubusercontent.com/Alir3z4/stop-words/master/spanish.txt',
    header=None,
    dtype=str,
    keep_default_na=False,
).values
stopwords_spanish = Counter(stopwords_spanish.flatten().tolist())

stopwords_english = pd.read_csv(
    'https://raw.githubusercontent.com/Alir3z4/stop-words/master/english.txt',
    header=None,
    dtype=str,
    keep_default_na=False,
).values
stopwords_english = Counter(stopwords_english.flatten().tolist())

Constructor de sentence-embeddings¶

TF-IDF¶

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
In [7]:
class StemmerTokenizer:
    """Callable tokenizer: word-tokenize a document, drop stop words, stem the rest.

    Parameters
    ----------
    stopwords : container of str
        Tokens to discard. Only membership tests (``in``) are performed on it.
    language : str, default "english"
        Language passed to NLTK's ``word_tokenize`` and used to choose the
        stemmer. "english" keeps the original behaviour (Porter stemmer);
        any other value uses NLTK's Snowball stemmer for that language,
        because the Porter algorithm only implements English suffix rules.
    """

    def __init__(self, stopwords, language="english"):
        self.sw = stopwords
        self.language = language
        if language == "english":
            self.ps = PorterStemmer()
        else:
            # `nltk` is imported at the top of the notebook.
            self.ps = nltk.stem.SnowballStemmer(language)

    def __call__(self, doc):
        """Return the list of stemmed tokens of ``doc`` that are not stop words."""
        tokens = word_tokenize(doc, language=self.language)
        # Stop words are filtered before stemming, so they are matched in surface form.
        return [self.ps.stem(tok) for tok in tokens if tok not in self.sw]


# Instantiate the tokenizers (the Spanish one now uses Spanish tokenization/stemming rules)
tokenizador_english = StemmerTokenizer(stopwords_english)
tokenizador_spanish = StemmerTokenizer(stopwords_spanish, language="spanish")
In [8]:
# Unigram bag-of-words counters, one per language, each using its own
# stemming tokenizer.
bow_english = CountVectorizer(ngram_range=(1, 1), tokenizer=tokenizador_english)
bow_spanish = CountVectorizer(ngram_range=(1, 1), tokenizer=tokenizador_spanish)

# Route the "text" column of the input frame to the matching vectorizer.
ct_bow_english = ColumnTransformer(transformers=[("BOW", bow_english, "text")])
ct_bow_spanish = ColumnTransformer(transformers=[("BOW", bow_spanish, "text")])
In [9]:
# Two-stage pipelines: bag-of-words counts followed by TF-IDF re-weighting.
pipe_tfidf_english = Pipeline(
    steps=[('bow', ct_bow_english), ('tfidf', TfidfTransformer())]
)
pipe_tfidf_spanish = Pipeline(
    steps=[('bow', ct_bow_spanish), ('tfidf', TfidfTransformer())]
)
In [10]:
%%time
# Fit the Spanish BOW -> TF-IDF pipeline on the ES training data and keep the
# transformed matrix.
es_train_tfidf_sentence_embedding = pipe_tfidf_spanish.fit_transform(df_es_train)
# Ids and labels are read from the same frame that was transformed.
es_train_tfidf_sentence_embedding_id = df_es_train["id"]
es_train_tfidf_sentence_embedding_label = df_es_train["label"].values
Wall time: 18.2 s
In [11]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import time
from sklearn.metrics.cluster import v_measure_score
from sklearn.metrics.cluster import rand_score, homogeneity_score, completeness_score, v_measure_score
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import normalized_mutual_info_score
import plotly.express as px
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.manifold import TSNE
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS
from sklearn.mixture import GaussianMixture
In [12]:
from analisis_clustering import clustering_report, distance_matrix, plot_distance_matrix, projection, get_projection_df, plot_confussion_matrix 

ES¶

In [13]:
%%time
# Draw a seeded subsample of at most 20,000 *distinct* rows for clustering.
# replace=False is required: RandomState.choice samples with replacement by
# default, which would put duplicated documents into the clustering input.
# The size is capped at the number of available rows so the draw cannot fail
# on a smaller dataset.
n_rows_es = es_train_tfidf_sentence_embedding.shape[0]
sample_index_es = np.random.RandomState(0).choice(
    n_rows_es,
    size=min(20000, n_rows_es),
    replace=False,
)
X_es = es_train_tfidf_sentence_embedding[sample_index_es]
labels_es = es_train_tfidf_sentence_embedding_label[sample_index_es]
Wall time: 10 ms
In [14]:
from sklearn.decomposition import TruncatedSVD
In [15]:
# Project the TF-IDF sample onto 768 truncated-SVD components (seeded);
# X_trans is the matrix fed to the clustering and projection steps below.
X_trans = TruncatedSVD(
    n_components=768,
    n_iter=7,
    random_state=42,
).fit_transform(X_es)

KMeans¶

In [16]:
%%time
# KMeans on the SVD-reduced sample: one cluster per distinct ES label, fixed seed.
# clustering_report comes from analisis_clustering; its result is indexed by
# "clusters" further down.
kmeans_es_report = clustering_report(
    X_trans,
    labels_es,
    KMeans,
    dict(n_clusters=n_labels_es, random_state=0),
)
Fit...
Fit (done)
Predict...
Predict (done)
Silhouette...
Silhouette (done)
Metricas...
Metricas (done)
Wall time: 9.76 s
In [17]:
# Persist the KMeans report; the with-block flushes and closes the file even
# if pickling raises, instead of leaking the handle.
with open("tfidf_kmeans_es_report.pickle", "wb") as fh:
    pickle.dump(kmeans_es_report, fh)

KMeans: pseudo-matriz de confusion¶

In [ ]:
# Pseudo-confusion matrix: true ES labels against the KMeans cluster assignments.
plot_confussion_matrix(
    labels_es,
    kmeans_es_report["clusters"],
)
In [18]:
%%time
# Distance matrix for the KMeans clusters (size=0.1 is forwarded to the helper;
# presumably a sampling fraction — TODO confirm in analisis_clustering).
# NOTE(review): distances are computed on X_es (TF-IDF sample) while the
# clusters were fitted on X_trans — confirm this is intended.
kmeans_es_dm = distance_matrix(
    X_es,
    kmeans_es_report["clusters"],
    size=0.1,
)
Indices...
Indices (done)
Matriz de distancias...
Matriz de distancias (done)
Wall time: 58.1 ms
In [19]:
# Persist the KMeans distance matrix; the with-block guarantees the file is
# flushed and closed.
with open("tfidf_kmeans_es_dm.pickle", "wb") as fh:
    pickle.dump(kmeans_es_dm, fh)

KMeans: Matriz de distancias¶

In [ ]:
# Render the KMeans distance matrix; the display parameters are passed through
# to the plotting helper unchanged.
plot_distance_matrix(
    kmeans_es_dm,
    q1=0.05,
    q3=0.69,
    x0=-70,
    scale=1.02,
    num_tags=5,
)

KMeans: Grafico de clusters: UMAP¶

In [18]:
%%time
# Seeded UMAP projection of the SVD-reduced sample.
umap_es_projection = projection(
    X_trans,
    umap.UMAP,
    dict(random_state=0, n_neighbors=7, min_dist=0),
)
Reducer...
Reducer (done)
Wall time: 22.3 s
In [19]:
# Persist the UMAP projection; the with-block guarantees the file is flushed
# and closed.
with open("tfidf_umap_es_projection.pickle", "wb") as fh:
    pickle.dump(umap_es_projection, fh)
In [ ]:
# Plotting frame: UMAP projection together with the KMeans clusters and true labels.
df_umap_es = get_projection_df(
    umap_es_projection,
    kmeans_es_report["clusters"],
    labels_es,
)
In [ ]:
# Seeded 1,000-row sample of the UMAP frame, coloured by the "cluster" column.
fig = px.scatter(
    data_frame=df_umap_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="cluster",
)
fig.show(renderer="notebook")
In [ ]:
# Same seeded 1,000-row sample of the UMAP frame, coloured by the "label" column.
fig = px.scatter(
    data_frame=df_umap_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="label",
)
fig.show(renderer="notebook")

KMeans: Grafico de clusters: TSNE¶

In [20]:
%%time
# 2-D t-SNE projection of the SVD-reduced sample.
# random_state is pinned: with init='random' and no seed every run produces a
# different embedding, so the saved pickle and the plots would not be
# reproducible (the UMAP projection is seeded the same way).
tsne_es_projection = projection(
    X_trans,
    TSNE,
    {
        "n_components": 2,
        "learning_rate": 'auto',
        "init": 'random',
        "random_state": 0,
    },
)
Reducer...
Reducer (done)
Wall time: 1min 28s
In [21]:
# Persist the t-SNE projection; the with-block guarantees the file is flushed
# and closed.
with open("tfidf_tsne_es_projection.pickle", "wb") as fh:
    pickle.dump(tsne_es_projection, fh)
In [17]:
# Plotting frame: t-SNE projection together with the KMeans clusters and true labels.
# Requires tsne_es_projection from the cell above to have been run in this kernel.
df_tsne_es = get_projection_df(
    tsne_es_projection,
    kmeans_es_report["clusters"],
    labels_es,
)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_14508/1424871288.py in <module>
----> 1 df_tsne_es = get_projection_df(tsne_es_projection, kmeans_es_report["clusters"], labels_es)

NameError: name 'tsne_es_projection' is not defined
In [ ]:
# Seeded 1,000-row sample of the t-SNE frame, coloured by the "cluster" column.
fig = px.scatter(
    data_frame=df_tsne_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="cluster",
)
fig.show(renderer="notebook")
In [ ]:
# Same seeded 1,000-row sample of the t-SNE frame, coloured by the "label" column.
fig = px.scatter(
    data_frame=df_tsne_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="label",
)
fig.show(renderer="notebook")

KMeans: Grafico de clusters: PCA¶

In [22]:
# 2-D PCA projection of the SVD-reduced sample.
# random_state is pinned: for an input of this size with only 2 components,
# scikit-learn's default svd_solver='auto' policy is expected to select the
# randomized solver, whose result depends on the seed.
pca_es_projection = projection(
    X_trans,
    PCA,
    {"n_components": 2, "random_state": 0},
)
Reducer...
Reducer (done)
In [23]:
# Persist the PCA projection; the with-block guarantees the file is flushed
# and closed.
with open("tfidf_pca_es_projection.pickle", "wb") as fh:
    pickle.dump(pca_es_projection, fh)
In [ ]:
# Plotting frame: PCA projection together with the KMeans clusters and true labels.
df_pca_es = get_projection_df(
    pca_es_projection,
    kmeans_es_report["clusters"],
    labels_es,
)
In [ ]:
# Seeded 1,000-row sample of the PCA frame, coloured by the "cluster" column.
fig = px.scatter(
    data_frame=df_pca_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="cluster",
)
fig.show(renderer="notebook")
In [ ]:
# Same seeded 1,000-row sample of the PCA frame, coloured by the "label" column.
fig = px.scatter(
    data_frame=df_pca_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="label",
)
fig.show(renderer="notebook")

Agglomerative Hierarchical¶

In [20]:
%%time
# Agglomerative (hierarchical) clustering on the SVD-reduced sample, one
# cluster per distinct ES label.
hc_es_report = clustering_report(
    X_trans,
    labels_es,
    AgglomerativeClustering,
    dict(n_clusters=n_labels_es),
)
Fit...
Fit (done)
Predict...
Predict (done)
Silhouette...
Silhouette (done)
Metricas...
Metricas (done)
Wall time: 1min 37s
In [21]:
# Persist the hierarchical-clustering report; the with-block guarantees the
# file is flushed and closed.
with open("tfidf_hc_es_report.pickle", "wb") as fh:
    pickle.dump(hc_es_report, fh)

Agglomerative Hierarchical: pseudo-matriz de confusion¶

In [ ]:
# Pseudo-confusion matrix: true ES labels against the hierarchical cluster assignments.
plot_confussion_matrix(
    labels_es,
    hc_es_report["clusters"],
)
In [22]:
%%time
# Distance matrix for the hierarchical clusters (size=0.1 is forwarded to the
# helper; presumably a sampling fraction — TODO confirm).
# NOTE(review): distances use X_es while the clusters were fitted on X_trans —
# confirm this is intended.
hc_es_dm = distance_matrix(
    X_es,
    hc_es_report["clusters"],
    size=0.1,
)
Indices...
Indices (done)
Matriz de distancias...
Matriz de distancias (done)
Wall time: 56.1 ms
In [23]:
# Persist the hierarchical-clustering distance matrix; the with-block
# guarantees the file is flushed and closed.
with open("tfidf_hc_es_dm.pickle", "wb") as fh:
    pickle.dump(hc_es_dm, fh)

Agglomerative Hierarchical: Matriz de distancias¶

In [ ]:
# Render the hierarchical-clustering distance matrix; display parameters are
# passed through to the plotting helper unchanged.
plot_distance_matrix(
    hc_es_dm,
    q1=0.05,
    q3=0.69,
    x0=-70,
    scale=1.02,
    num_tags=5,
)

Agglomerative Hierarchical: Grafico de clusters: UMAP¶

In [ ]:
# Plotting frame: UMAP projection with the hierarchical clusters and true labels
# (rebinds df_umap_es from the KMeans section).
df_umap_es = get_projection_df(
    umap_es_projection,
    hc_es_report["clusters"],
    labels_es,
)
In [ ]:
# Seeded 1,000-row sample of the UMAP frame, coloured by the "cluster" column.
fig = px.scatter(
    data_frame=df_umap_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="cluster",
)
fig.show(renderer="notebook")
In [ ]:
# Same seeded 1,000-row sample of the UMAP frame, coloured by the "label" column.
fig = px.scatter(
    data_frame=df_umap_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="label",
)
fig.show(renderer="notebook")

Agglomerative Hierarchical: Grafico de clusters: TSNE¶

In [ ]:
# Plotting frame: t-SNE projection with the hierarchical clusters and true labels
# (rebinds df_tsne_es from the KMeans section).
df_tsne_es = get_projection_df(
    tsne_es_projection,
    hc_es_report["clusters"],
    labels_es,
)
In [ ]:
# Seeded 1,000-row sample of the t-SNE frame, coloured by the "cluster" column.
fig = px.scatter(
    data_frame=df_tsne_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="cluster",
)
fig.show(renderer="notebook")
In [ ]:
# Same seeded 1,000-row sample of the t-SNE frame, coloured by the "label" column.
fig = px.scatter(
    data_frame=df_tsne_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="label",
)
fig.show(renderer="notebook")

Agglomerative Hierarchical: Grafico de clusters: PCA¶

In [ ]:
# Plotting frame: PCA projection with the hierarchical clusters and true labels
# (rebinds df_pca_es from the KMeans section).
df_pca_es = get_projection_df(
    pca_es_projection,
    hc_es_report["clusters"],
    labels_es,
)
In [ ]:
# Seeded 1,000-row sample of the PCA frame, coloured by the "cluster" column.
fig = px.scatter(
    data_frame=df_pca_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="cluster",
)
fig.show(renderer="notebook")
In [ ]:
# Same seeded 1,000-row sample of the PCA frame, coloured by the "label" column.
fig = px.scatter(
    data_frame=df_pca_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="label",
)
fig.show(renderer="notebook")

DBSCAN¶

In [24]:
%%time
# DBSCAN on the SVD-reduced sample (eps=0.4, min_samples=5); the number of
# clusters is not fixed in advance here.
dbscan_es_report = clustering_report(
    X_trans,
    labels_es,
    DBSCAN,
    dict(eps=0.4, min_samples=5),
)
Fit...
Fit (done)
Predict...
Predict (done)
Silhouette...
Silhouette (done)
Metricas...
Metricas (done)
Wall time: 11.4 s
In [25]:
# Persist the DBSCAN report; the with-block guarantees the file is flushed
# and closed.
with open("tfidf_dbscan_es_report.pickle", "wb") as fh:
    pickle.dump(dbscan_es_report, fh)

DBSCAN: pseudo-matriz de confusion¶

In [ ]:
# Pseudo-confusion matrix: true ES labels against the DBSCAN cluster assignments.
plot_confussion_matrix(
    labels_es,
    dbscan_es_report["clusters"],
)
In [26]:
%%time
# Distance matrix for the DBSCAN clusters (size=0.1 is forwarded to the helper;
# presumably a sampling fraction — TODO confirm).
# NOTE(review): distances use X_es while the clusters were fitted on X_trans —
# confirm this is intended.
dbscan_es_dm = distance_matrix(
    X_es,
    dbscan_es_report["clusters"],
    size=0.1,
)
Indices...
Indices (done)
Matriz de distancias...
Matriz de distancias (done)
Wall time: 67.1 ms
In [27]:
# Persist the DBSCAN distance matrix; the with-block guarantees the file is
# flushed and closed.
with open("tfidf_dbscan_es_dm.pickle", "wb") as fh:
    pickle.dump(dbscan_es_dm, fh)

DBSCAN: Matriz de distancias¶

In [ ]:
# Render the DBSCAN distance matrix; note num_tags=2 here, unlike the other
# clustering sections, which use 5.
plot_distance_matrix(
    dbscan_es_dm,
    q1=0.05,
    q3=0.69,
    x0=-70,
    scale=1.02,
    num_tags=2,
)

DBSCAN: Grafico de clusters: UMAP¶

In [ ]:
# Plotting frame: UMAP projection with the DBSCAN clusters and true labels
# (rebinds df_umap_es from the previous section).
df_umap_es = get_projection_df(
    umap_es_projection,
    dbscan_es_report["clusters"],
    labels_es,
)
In [ ]:
# Seeded 1,000-row sample of the UMAP frame, coloured by the "cluster" column.
fig = px.scatter(
    data_frame=df_umap_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="cluster",
)
fig.show(renderer="notebook")
In [ ]:
# Same seeded 1,000-row sample of the UMAP frame, coloured by the "label" column.
fig = px.scatter(
    data_frame=df_umap_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="label",
)
fig.show(renderer="notebook")

DBSCAN: Grafico de clusters: TSNE¶

In [ ]:
# Plotting frame: t-SNE projection with the DBSCAN clusters and true labels
# (rebinds df_tsne_es from the previous section).
df_tsne_es = get_projection_df(
    tsne_es_projection,
    dbscan_es_report["clusters"],
    labels_es,
)
In [ ]:
# Seeded 1,000-row sample of the t-SNE frame, coloured by the "cluster" column.
fig = px.scatter(
    data_frame=df_tsne_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="cluster",
)
fig.show(renderer="notebook")
In [ ]:
# Same seeded 1,000-row sample of the t-SNE frame, coloured by the "label" column.
fig = px.scatter(
    data_frame=df_tsne_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="label",
)
fig.show(renderer="notebook")

DBSCAN: Grafico de clusters: PCA¶

In [ ]:
# Plotting frame: PCA projection with the DBSCAN clusters and true labels
# (rebinds df_pca_es from the previous section).
df_pca_es = get_projection_df(
    pca_es_projection,
    dbscan_es_report["clusters"],
    labels_es,
)
In [ ]:
# Seeded 1,000-row sample of the PCA frame, coloured by the "cluster" column.
fig = px.scatter(
    data_frame=df_pca_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="cluster",
)
fig.show(renderer="notebook")
In [ ]:
# Same seeded 1,000-row sample of the PCA frame, coloured by the "label" column.
fig = px.scatter(
    data_frame=df_pca_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="label",
)
fig.show(renderer="notebook")

Gaussian Mixture¶

In [28]:
%%time
# Gaussian mixture on the SVD-reduced sample: one component per distinct ES
# label, fixed seed.
gm_es_report = clustering_report(
    X_trans,
    labels_es,
    GaussianMixture,
    dict(n_components=n_labels_es, random_state=0),
)
Fit...
Fit (done)
Predict...
Predict (done)
Silhouette...
Silhouette (done)
Metricas...
Metricas (done)
Wall time: 4min 9s
In [29]:
# Persist the Gaussian-mixture report; the with-block guarantees the file is
# flushed and closed.
with open("tfidf_gm_es_report.pickle", "wb") as fh:
    pickle.dump(gm_es_report, fh)

Gaussian Mixture: pseudo-matriz de confusion¶

In [16]:
# Pseudo-confusion matrix: true ES labels against the Gaussian-mixture assignments.
plot_confussion_matrix(
    labels_es,
    gm_es_report["clusters"],
)
In [30]:
%%time
# Distance matrix for the Gaussian-mixture clusters (size=0.1 is forwarded to
# the helper; presumably a sampling fraction — TODO confirm).
# NOTE(review): distances use X_es while the clusters were fitted on X_trans —
# confirm this is intended.
gm_es_dm = distance_matrix(
    X_es,
    gm_es_report["clusters"],
    size=0.1,
)
Indices...
Indices (done)
Matriz de distancias...
Matriz de distancias (done)
Wall time: 63.1 ms
In [31]:
# Persist the Gaussian-mixture distance matrix; the with-block guarantees the
# file is flushed and closed.
with open("tfidf_gm_es_dm.pickle", "wb") as fh:
    pickle.dump(gm_es_dm, fh)

Gaussian Mixture: Matriz de distancias¶

In [19]:
# Render the Gaussian-mixture distance matrix; display parameters are passed
# through to the plotting helper unchanged.
plot_distance_matrix(
    gm_es_dm,
    q1=0.05,
    q3=0.69,
    x0=-70,
    scale=1.02,
    num_tags=5,
)

Gaussian Mixture: Grafico de clusters: UMAP¶

In [21]:
# Plotting frame: UMAP projection with the Gaussian-mixture clusters and true
# labels (rebinds df_umap_es from the previous section).
df_umap_es = get_projection_df(
    umap_es_projection,
    gm_es_report["clusters"],
    labels_es,
)
In [22]:
# Seeded 1,000-row sample of the UMAP frame, coloured by the "cluster" column.
fig = px.scatter(
    data_frame=df_umap_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="cluster",
)
fig.show(renderer="notebook")
In [23]:
# Same seeded 1,000-row sample of the UMAP frame, coloured by the "label" column.
fig = px.scatter(
    data_frame=df_umap_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="label",
)
fig.show(renderer="notebook")

Gaussian Mixture: Grafico de clusters: TSNE¶

In [26]:
# Plotting frame: t-SNE projection with the Gaussian-mixture clusters and true
# labels (rebinds df_tsne_es from the previous section).
df_tsne_es = get_projection_df(
    tsne_es_projection,
    gm_es_report["clusters"],
    labels_es,
)
In [27]:
# Seeded 1,000-row sample of the t-SNE frame, coloured by the "cluster" column.
fig = px.scatter(
    data_frame=df_tsne_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="cluster",
)
fig.show(renderer="notebook")
In [28]:
# Same seeded 1,000-row sample of the t-SNE frame, coloured by the "label" column.
fig = px.scatter(
    data_frame=df_tsne_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="label",
)
fig.show(renderer="notebook")

Gaussian Mixture: Grafico de clusters: PCA¶

In [29]:
# Plotting frame: PCA projection with the Gaussian-mixture clusters and true
# labels (rebinds df_pca_es from the previous section).
df_pca_es = get_projection_df(
    pca_es_projection,
    gm_es_report["clusters"],
    labels_es,
)
In [30]:
# Seeded 1,000-row sample of the PCA frame, coloured by the "cluster" column.
fig = px.scatter(
    data_frame=df_pca_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="cluster",
)
fig.show(renderer="notebook")
In [31]:
# Same seeded 1,000-row sample of the PCA frame, coloured by the "label" column.
fig = px.scatter(
    data_frame=df_pca_es.sample(n=1000, random_state=0),
    x=0,
    y=1,
    color="label",
)
fig.show(renderer="notebook")